{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Final Classifying Movie Scripts : Predict The Movie Genre.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "machine_shape": "hm"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9jLebI-OhcY-",
        "colab_type": "text"
      },
      "source": [
        "## Import Required Libraries and Mount Drive"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IiipiLoq196Q",
        "colab_type": "code",
        "outputId": "eaa47d55-63f2-4ee2-c6f4-05a5e4be1c85",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 128
        }
      },
      "source": [
        "import os\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import xgboost as xgb\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.decomposition import TruncatedSVD\n",
        "from sklearn import  metrics, naive_bayes\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.model_selection import GridSearchCV\n",
        "from sklearn.model_selection import StratifiedKFold\n",
        "\n",
        "# Load the Drive helper and mount Google Drive.\n",
        "from google.colab import drive\n",
        "\n",
        "# This will prompt for authorization.\n",
        "drive.mount('/content/drive')\n",
        "\n",
        "# nltk.download() only fetches the corpus and returns a success flag, so the\n",
        "# stop-word list itself is read from the corpus afterwards.\n",
        "nltk.download('stopwords')\n",
        "eng_stopwords = set(stopwords.words('english'))\n",
        "# Turn off pandas' chained-assignment warning for the rest of the notebook.\n",
        "pd.options.mode.chained_assignment = None\n"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
            "\n",
            "Enter your authorization code:\n",
            "··········\n",
            "Mounted at /content/drive\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gg0X6xeLh4l1",
        "colab_type": "text"
      },
      "source": [
        "### Functions to train each model and predict on a new dataset"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QsLAMe8n4_4v",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Naive Bayes on the word TF-IDF features.\n",
        "def runMNB(train_X, train_y, test_X, test_y, test_X2):\n",
        "    \"\"\"Fit a multinomial Naive Bayes model and score two feature matrices.\n",
        "\n",
        "    train_X / train_y are used for fitting; test_X and test_X2 are each passed\n",
        "    to predict_proba. test_y is accepted but never read.\n",
        "\n",
        "    Returns (probabilities for test_X, probabilities for test_X2, fitted model).\n",
        "    \"\"\"\n",
        "    nb_clf = naive_bayes.MultinomialNB().fit(train_X, train_y)\n",
        "    first_probs, second_probs = (nb_clf.predict_proba(m) for m in (test_X, test_X2))\n",
        "    return first_probs, second_probs, nb_clf\n",
        "\n",
        "def log_reg(train_X, train_y, test_X, test_y, test_X2):\n",
        "    \"\"\"Fit a class-weighted multinomial logistic regression and score two matrices.\n",
        "\n",
        "    Each class is weighted by its relative frequency in train_y. test_y is\n",
        "    accepted but never read.\n",
        "\n",
        "    Returns (probabilities for test_X, probabilities for test_X2, fitted model).\n",
        "    \"\"\"\n",
        "    # NOTE(review): the weight grows with class frequency, so frequent classes\n",
        "    # count more - confirm this is intended rather than inverse-frequency weights.\n",
        "    class_share = pd.Series(train_y).value_counts(normalize=True)\n",
        "    weights = dict(zip(class_share.index.astype(int), class_share.values))\n",
        "    lr_clf = LogisticRegression(C=1.0, multi_class='multinomial',\n",
        "                                solver='newton-cg', class_weight=weights)\n",
        "    lr_clf.fit(train_X, train_y)\n",
        "    return lr_clf.predict_proba(test_X), lr_clf.predict_proba(test_X2), lr_clf\n",
        "\n",
        "def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):\n",
        "    \"\"\"Train a multi-class XGBoost booster and return class probabilities.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    train_X, train_y : features and labels used to fit the booster; the number\n",
        "        of classes is taken from the distinct values in train_y.\n",
        "    test_X : features scored with the fitted booster.\n",
        "    test_y : optional labels for test_X. When given, (test_X, test_y) is watched\n",
        "        during training and training stops after 50 rounds without improvement;\n",
        "        otherwise all 10000 rounds are run.\n",
        "    test_X2 : optional second feature matrix to score.\n",
        "    seed_val, child, colsample : passed to XGBoost as seed, min_child_weight\n",
        "        and colsample_bytree.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    (pred_test_y, pred_test_y2, model). pred_test_y2 is None when test_X2 is\n",
        "    None; previously that case raised UnboundLocalError at the return.\n",
        "    \"\"\"\n",
        "    param = {\n",
        "        'objective': 'multi:softprob',\n",
        "        'eta': 0.1,\n",
        "        'max_depth': 10,\n",
        "        'silent': 1,\n",
        "        'num_class': len(np.unique(train_y)),\n",
        "        'eval_metric': 'mlogloss',\n",
        "        'min_child_weight': child,\n",
        "        'subsample': 0.8,\n",
        "        'colsample_bytree': colsample,\n",
        "        'seed': seed_val,\n",
        "    }\n",
        "    num_rounds = 10000\n",
        "\n",
        "    xgtrain = xgb.DMatrix(train_X, label=train_y)\n",
        "\n",
        "    if test_y is not None:\n",
        "        # With labels available, monitor the test set and stop early.\n",
        "        xgtest = xgb.DMatrix(test_X, label=test_y)\n",
        "        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]\n",
        "        model = xgb.train(param, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)\n",
        "    else:\n",
        "        xgtest = xgb.DMatrix(test_X)\n",
        "        model = xgb.train(param, xgtrain, num_rounds)\n",
        "\n",
        "    # NOTE(review): ntree_limit / best_ntree_limit belong to the older XGBoost API;\n",
        "    # newer releases use iteration_range instead - confirm the installed version.\n",
        "    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)\n",
        "\n",
        "    # Default to None so the return below is valid when no second matrix is given.\n",
        "    pred_test_y2 = None\n",
        "    if test_X2 is not None:\n",
        "        xgtest2 = xgb.DMatrix(test_X2)\n",
        "        pred_test_y2 = model.predict(xgtest2, ntree_limit=model.best_ntree_limit)\n",
        "    return pred_test_y, pred_test_y2, model"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "fz5RW_m-ipig",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Read the cleaned dataset (feather file) from the mounted Drive folder.\n",
        "cleaned_data_path = 'drive/My Drive/datasets/cleaned_data.ft'\n",
        "all_data = pd.read_feather(cleaned_data_path)\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2SvAPK6X2aCD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Split into train and test dataframes: rows with no missing values form the\n",
        "# training set, rows with any missing value form the test set\n",
        "# (presumably the missing value is the label - confirm against the data).\n",
        "has_missing = all_data.isna().any(axis=1)\n",
        "train = all_data[~has_missing]\n",
        "test = all_data[has_missing]\n",
        "\n",
        "# Upsample under-represented classes: keep duplicating all rows of a class\n",
        "# until it has at least min_rows_per_class rows (the class doubles each pass).\n",
        "# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.\n",
        "min_rows_per_class = 10\n",
        "label_counts = train['Labels'].value_counts()\n",
        "for rare_label in label_counts.index[label_counts.values < min_rows_per_class]:\n",
        "    while (train['Labels'] == rare_label).sum() < min_rows_per_class:\n",
        "        train = pd.concat([train, train[train['Labels'] == rare_label].reset_index(drop=True)])\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JBPGM8OhHqKc",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Word-level TF-IDF over uni- to tri-grams, capped at 20000 features.\n",
        "# use_idf / sublinear_tf are passed as real booleans: recent scikit-learn\n",
        "# releases validate parameter types and reject the integer 1.\n",
        "tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), analyzer='word', max_df=.95,\n",
        "                            use_idf=True, sublinear_tf=True, min_df=3, max_features=20000, strip_accents='ascii')\n",
        "# The vocabulary and IDF weights are fitted on the train and test scripts together.\n",
        "full_tfidf = tfidf_vec.fit_transform(train['Script'].values.tolist() + test['Script'].values.tolist())\n",
        "train_tfidf = tfidf_vec.transform(train['Script'].values.tolist())\n",
        "test_tfidf = tfidf_vec.transform(test['Script'].values.tolist())\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MtqMutRLrsbR",
        "colab_type": "text"
      },
      "source": [
        "The class probabilities predicted by the Naive Bayes classifier are used as features for the ensemble model at the end."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Cdm4hnxhH3RQ",
        "colab_type": "code",
        "outputId": "f7fd6809-b2cb-496e-e813-93867cedc701",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "# Stratified 3-fold CV with Naive Bayes: each fold's model scores its held-out\n",
        "# rows (stored in pred_train) and the whole test set (averaged over the folds).\n",
        "cv_scores = []\n",
        "pred_full_test = 0\n",
        "kfold_splits = 3\n",
        "pred_train = np.zeros([train.shape[0], len(train['Labels'].value_counts())])\n",
        "kf = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=2017)\n",
        "for dev_index, val_index in kf.split(train, train['Labels']):\n",
        "    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]\n",
        "    dev_y, val_y = train['Labels'].iloc[dev_index], train['Labels'].iloc[val_index]\n",
        "    pred_val_y, pred_test_y, model = runMNB(dev_X, dev_y, val_X, val_y, test_tfidf)\n",
        "    pred_full_test = pred_full_test + pred_test_y\n",
        "    pred_train[val_index, :] = pred_val_y\n",
        "    # Pass the class list explicitly so the probability columns are matched to\n",
        "    # labels even if a validation fold happens to miss a class.\n",
        "    cv_scores.append(metrics.log_loss(val_y, pred_val_y, labels=model.classes_))\n",
        "pred_full_test = pred_full_test / kfold_splits\n",
        "\n",
        "# Report the score; the scores were collected but never shown.\n",
        "print(\"Mean cv score : \", np.mean(cv_scores))"
      ],
      "execution_count": 93,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Mean cv score :  3.6411891659365083\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "X4p59PmFvISW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Add the predictions of the Naive Bayes classifier as new features, one column\n",
        "# per class. The number of classes is read from the prediction matrix instead\n",
        "# of being hard-coded as 22.\n",
        "nb_feature_cols = ['nb_tfidf_char' + str(i) for i in range(pred_train.shape[1])]\n",
        "final_dftrain = pd.DataFrame(pred_train, columns=nb_feature_cols)\n",
        "final_dftest = pd.DataFrame(pred_full_test, columns=nb_feature_cols)\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_oicNHqIsIfL",
        "colab_type": "text"
      },
      "source": [
        "The class probabilities predicted by the logistic regression classifier are used as features for the ensemble model at the end."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ROUYOzyQryO8",
        "colab_type": "code",
        "outputId": "432d0439-49da-496c-815c-3eb24a3176f0",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "# Stratified 3-fold CV with logistic regression: each fold's model scores its\n",
        "# held-out rows (stored in pred_train) and the whole test set (averaged over folds).\n",
        "cv_scores = []\n",
        "pred_full_test = 0\n",
        "kfold_splits = 3\n",
        "pred_train = np.zeros([train.shape[0], len(train['Labels'].value_counts())])\n",
        "kf = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=2017)\n",
        "for dev_index, val_index in kf.split(train, train['Labels']):\n",
        "    dev_X, val_X = train_tfidf[dev_index], train_tfidf[val_index]\n",
        "    dev_y, val_y = train['Labels'].iloc[dev_index], train['Labels'].iloc[val_index]\n",
        "    pred_val_y, pred_test_y, model = log_reg(dev_X, dev_y, val_X, val_y, test_tfidf)\n",
        "    pred_full_test = pred_full_test + pred_test_y\n",
        "    pred_train[val_index, :] = pred_val_y\n",
        "    # Pass the class list explicitly so the probability columns are matched to\n",
        "    # labels even if a validation fold happens to miss a class.\n",
        "    cv_scores.append(metrics.log_loss(val_y, pred_val_y, labels=model.classes_))\n",
        "pred_full_test = pred_full_test / kfold_splits\n",
        "\n",
        "# Report the score; the scores were collected but never shown.\n",
        "print(\"Mean cv score : \", np.mean(cv_scores))\n",
        "\n",
        "# Add the predictions of logistic regression as new features, one column per\n",
        "# class (count read from the prediction matrix instead of a hard-coded 22).\n",
        "for i in range(pred_train.shape[1]):\n",
        "    final_dftrain['lr_tfidf_char' + str(i)] = pred_train[:, i]\n",
        "    final_dftest['lr_tfidf_char' + str(i)] = pred_full_test[:, i]\n"
      ],
      "execution_count": 96,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Mean cv score :  2.784481302166791\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lQ4DqgQB0hks",
        "colab_type": "text"
      },
      "source": [
        "Create 40 components with the help of SVD, to use in the final ensemble model."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KYdhBaxPMF2j",
        "colab_type": "code",
        "outputId": "a6323d59-22e8-40f7-c9b8-46c13db4d0a5",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "# Reduce the TF-IDF matrix to n_comp SVD components; the decomposition is\n",
        "# fitted on full_tfidf and then applied to the train and test matrices.\n",
        "n_comp = 40\n",
        "svd_cols = ['svd_word_' + str(k) for k in range(n_comp)]\n",
        "svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack').fit(full_tfidf)\n",
        "train_svd = pd.DataFrame(svd_obj.transform(train_tfidf), columns=svd_cols)\n",
        "test_svd = pd.DataFrame(svd_obj.transform(test_tfidf), columns=svd_cols)\n",
        "\n",
        "train_svd.shape, test_svd.shape"
      ],
      "execution_count": 97,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "((2065, 40), (849, 40))"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 97
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "a8VWvtIP2TJD",
        "colab_type": "code",
        "outputId": "fb1fd9a3-4dc6-422c-d7d6-3f8706ee47cd",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "# Put the original columns, the SVD components and the stacked model\n",
        "# probabilities side by side, then keep only the feature columns.\n",
        "cols_to_drop = ['File_Name', 'Script']\n",
        "non_feature_cols = cols_to_drop + ['Labels']\n",
        "\n",
        "train_df = pd.concat([train.reset_index(drop=True), train_svd, final_dftrain], axis=1)\n",
        "test_df = pd.concat([test.reset_index(drop=True), test_svd, final_dftest], axis=1)\n",
        "\n",
        "train_X = train_df.drop(non_feature_cols, axis=1)\n",
        "test_X = test_df.drop(non_feature_cols, axis=1)\n"
      ],
      "execution_count": 98,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "((2065, 3), (2065, 40), (849, 87))"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 98
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZO1et6z5k9mQ",
        "colab_type": "text"
      },
      "source": [
        "### Join the Naive Bayes and logistic regression probabilities with the SVD components"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2x471ibz2W1p",
        "colab_type": "code",
        "outputId": "7de4e716-8704-4b15-ec7d-b314784f7bde",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 235
        }
      },
      "source": [
        "# Train the final XGBoost model on the stacked features.\n",
        "kfold_splits = 3\n",
        "kf = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=2017)\n",
        "cv_scores = []\n",
        "pred_full_test = 0\n",
        "pred_train = np.zeros([train.shape[0], len(train['Labels'].value_counts())])\n",
        "for dev_index, val_index in kf.split(train_X, train_df['Labels']):\n",
        "    # kf.split yields positional indices, so rows are selected with .iloc.\n",
        "    dev_X, val_X = train_X.iloc[dev_index], train_X.iloc[val_index]\n",
        "    dev_y, val_y = train_df['Labels'].iloc[dev_index], train_df['Labels'].iloc[val_index]\n",
        "    pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, test_X, seed_val=0, colsample=0.7)\n",
        "    pred_full_test = pred_full_test + pred_test_y\n",
        "    pred_train[val_index, :] = pred_val_y\n",
        "    cv_scores.append(metrics.log_loss(val_y, pred_val_y))\n",
        "    # Only the first fold is trained; remove this break to run every fold.\n",
        "    break\n",
        "print(\"cv scores : \", cv_scores)\n",
        "\n",
        "# Average over the folds that actually ran. Dividing by kfold_splits after the\n",
        "# early break scaled every probability row so that it summed to 1/3.\n",
        "pred_full_test = pred_full_test / len(cv_scores)"
      ],
      "execution_count": 103,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[0]\ttrain-mlogloss:2.83586\ttest-mlogloss:2.95544\n",
            "Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.\n",
            "\n",
            "Will train until test-mlogloss hasn't improved in 50 rounds.\n",
            "[20]\ttrain-mlogloss:1.00221\ttest-mlogloss:2.31458\n",
            "[40]\ttrain-mlogloss:0.587272\ttest-mlogloss:2.25595\n",
            "[60]\ttrain-mlogloss:0.453294\ttest-mlogloss:2.30448\n",
            "[80]\ttrain-mlogloss:0.400465\ttest-mlogloss:2.37632\n",
            "Stopping. Best iteration:\n",
            "[42]\ttrain-mlogloss:0.566899\ttest-mlogloss:2.25492\n",
            "\n",
            "cv scores :  [2.2549159187277446]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qUqLtjy_kcaZ",
        "colab_type": "text"
      },
      "source": [
        "### Convert the output dataframe into the submission format provided in sample.xlsx"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "q6I9XbbCxc5H",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 128
        },
        "outputId": "f0c1e414-bcfe-446e-e3a7-0581c168b1ba"
      },
      "source": [
        "# Pair each test file name with its predicted class probabilities, row by row,\n",
        "# and write the result to an Excel sheet.\n",
        "test_set_preds = pd.DataFrame(pred_full_test).reset_index(drop=True)\n",
        "test_file_names = test['File_Name'].reset_index(drop=True)\n",
        "final_submission = pd.concat([test_file_names, test_set_preds], axis=1)\n",
        "final_submission.to_excel('test_set_preds_4.xlsx', index=False)\n",
        "final_submission.head(2)"
      ],
      "execution_count": 104,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>File_Name</th>\n",
              "      <th>0</th>\n",
              "      <th>1</th>\n",
              "      <th>2</th>\n",
              "      <th>3</th>\n",
              "      <th>4</th>\n",
              "      <th>5</th>\n",
              "      <th>6</th>\n",
              "      <th>7</th>\n",
              "      <th>8</th>\n",
              "      <th>9</th>\n",
              "      <th>10</th>\n",
              "      <th>11</th>\n",
              "      <th>12</th>\n",
              "      <th>13</th>\n",
              "      <th>14</th>\n",
              "      <th>15</th>\n",
              "      <th>16</th>\n",
              "      <th>17</th>\n",
              "      <th>18</th>\n",
              "      <th>19</th>\n",
              "      <th>20</th>\n",
              "      <th>21</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>file_2300.txt</td>\n",
              "      <td>0.004492</td>\n",
              "      <td>0.003791</td>\n",
              "      <td>0.003594</td>\n",
              "      <td>0.003557</td>\n",
              "      <td>0.023252</td>\n",
              "      <td>0.005760</td>\n",
              "      <td>0.100751</td>\n",
              "      <td>0.003987</td>\n",
              "      <td>0.00383</td>\n",
              "      <td>0.003552</td>\n",
              "      <td>0.003557</td>\n",
              "      <td>0.011290</td>\n",
              "      <td>0.004565</td>\n",
              "      <td>0.003607</td>\n",
              "      <td>0.007332</td>\n",
              "      <td>0.009261</td>\n",
              "      <td>0.004350</td>\n",
              "      <td>0.003565</td>\n",
              "      <td>0.003565</td>\n",
              "      <td>0.115861</td>\n",
              "      <td>0.005520</td>\n",
              "      <td>0.004294</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>file_809.txt</td>\n",
              "      <td>0.002903</td>\n",
              "      <td>0.002979</td>\n",
              "      <td>0.002732</td>\n",
              "      <td>0.002546</td>\n",
              "      <td>0.129413</td>\n",
              "      <td>0.004127</td>\n",
              "      <td>0.036130</td>\n",
              "      <td>0.003763</td>\n",
              "      <td>0.00507</td>\n",
              "      <td>0.002542</td>\n",
              "      <td>0.002545</td>\n",
              "      <td>0.002649</td>\n",
              "      <td>0.002555</td>\n",
              "      <td>0.002935</td>\n",
              "      <td>0.003236</td>\n",
              "      <td>0.108910</td>\n",
              "      <td>0.003436</td>\n",
              "      <td>0.002551</td>\n",
              "      <td>0.002551</td>\n",
              "      <td>0.004643</td>\n",
              "      <td>0.002555</td>\n",
              "      <td>0.002561</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "       File_Name         0         1  ...        19        20        21\n",
              "0  file_2300.txt  0.004492  0.003791  ...  0.115861  0.005520  0.004294\n",
              "1   file_809.txt  0.002903  0.002979  ...  0.004643  0.002555  0.002561\n",
              "\n",
              "[2 rows x 23 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 104
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "V6PdUgYkn67t",
        "colab_type": "text"
      },
      "source": [
        "### Ideas regarding the modelling part are taken shamelessly from the link below:\n",
        "https://www.kaggle.com/sudalairajkumar/simple-feature-engg-notebook-spooky-author"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fCNdoSGvM9TF",
        "colab_type": "text"
      },
      "source": [
        "Major Contribution by SRK on Kaggle paid off in this competition."
      ]
    }
  ]
}